library(tidyverse)
library(corrplot)
library(ggthemes)
library(e1071)
library(caret)
library(class)
library(leaps)
library(car)
library(plotly)
library(mvtnorm)
## ID Age Attrition BusinessTravel DailyRate Department
## 1 1 32 No Travel_Rarely 117 Sales
## 2 2 40 No Travel_Rarely 1308 Research & Development
## 3 3 35 No Travel_Frequently 200 Research & Development
## 4 4 32 No Travel_Rarely 801 Sales
## 5 5 24 No Travel_Frequently 567 Research & Development
## 6 6 27 No Travel_Frequently 294 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 13 4 Life Sciences 1 859
## 2 14 3 Medical 1 1128
## 3 18 2 Life Sciences 1 1412
## 4 1 4 Marketing 1 2016
## 5 2 1 Technical Degree 1 1646
## 6 10 2 Life Sciences 1 733
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Male 73 3 2
## 2 3 Male 44 2 5
## 3 3 Male 60 3 3
## 4 3 Female 48 3 3
## 5 1 Female 32 3 1
## 6 4 Male 32 3 3
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Sales Executive 4 Divorced 4403
## 2 Research Director 3 Single 19626
## 3 Manufacturing Director 4 Single 9362
## 4 Sales Executive 4 Married 10422
## 5 Research Scientist 4 Single 3760
## 6 Manufacturing Director 1 Divorced 8793
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1 9250 2 Y No 11
## 2 17544 1 Y No 14
## 3 19944 2 Y No 11
## 4 24032 1 Y No 19
## 5 17218 1 Y Yes 13
## 6 4809 1 Y No 21
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 3 80 1
## 2 3 1 80 0
## 3 3 3 80 0
## 4 3 3 80 2
## 5 3 3 80 0
## 6 4 3 80 2
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 3 2 5
## 2 21 2 4 20
## 3 10 2 3 2
## 4 14 3 3 14
## 5 6 2 3 6
## 6 9 4 2 9
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 2 0 3
## 2 7 4 9
## 3 2 2 2
## 4 10 5 7
## 5 3 1 3
## 6 7 1 7
After importing the data, we should first look for any missing values.
#Find out which columns have missing values (returns character(0) when there are none)
names(which(colSums(is.na(cs2))>0))
## character(0)
There does not appear to be any missing values in the entire dataset. We can summarize all of the variables to get an idea of their scales and spreads, as well as identify any that can be removed.
summary(cs2)
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 No :730 Non-Travel : 94
## 1st Qu.:218.2 1st Qu.:30.00 Yes:140 Travel_Frequently:158
## Median :435.5 Median :35.00 Travel_Rarely :618
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
##
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Human Resources : 35 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Research & Development:562 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Sales :273 Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
##
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Human Resources : 15 Min. :1 Min. : 1.0 Min. :1.000
## Life Sciences :358 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Marketing :100 Median :1 Median :1039.0 Median :3.000
## Medical :270 Mean :1 Mean :1029.8 Mean :2.701
## Other : 52 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Technical Degree: 75 Max. :1 Max. :2064.0 Max. :4.000
##
## Gender HourlyRate JobInvolvement JobLevel
## Female:354 Min. : 30.00 Min. :1.000 Min. :1.000
## Male :516 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
##
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Sales Executive :200 Min. :1.000 Divorced:191 Min. : 1081
## Research Scientist :172 1st Qu.:2.000 Married :410 1st Qu.: 2840
## Laboratory Technician :153 Median :3.000 Single :269 Median : 4946
## Manufacturing Director : 87 Mean :2.709 Mean : 6390
## Healthcare Representative: 76 3rd Qu.:4.000 3rd Qu.: 8182
## Sales Representative : 53 Max. :4.000 Max. :19999
## (Other) :129
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. : 2094 Min. :0.000 Y:870 No :618 Min. :11.0
## 1st Qu.: 8092 1st Qu.:1.000 Yes:252 1st Qu.:12.0
## Median :14074 Median :2.000 Median :14.0
## Mean :14326 Mean :2.728 Mean :15.2
## 3rd Qu.:20456 3rd Qu.:4.000 3rd Qu.:18.0
## Max. :26997 Max. :9.000 Max. :25.0
##
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.152 Mean :2.707 Mean :80 Mean :0.7839
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.05 Mean :2.832 Mean :2.782 Mean : 6.962
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:10.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 3.000 Median : 1.000 Median : 3.00
## Mean : 4.205 Mean : 2.169 Mean : 4.14
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.00
## Max. :18.000 Max. :15.000 Max. :17.00
##
Prediction models can obviously exclude superfluous data such as employee identifiers. We can also see from the summaries that Standard Hours and Over18 can also be removed as they are the same for every subject in the dataset.
cs2m <- select(cs2, -c("ID", "EmployeeCount", "EmployeeNumber", "StandardHours", "Over18"))
A correlation matrix can be used to observe collinearity of the continuous variables in the data. This is useful for both the attrition and salary analysis.
#Plot numeric variables v numeric variables
cs2m %>% keep(is.numeric) %>% cor %>% corrplot("upper", addCoef.col = "white", number.digits = 2, number.cex = 0.5, method="square", tl.srt=45, tl.cex = 0.6)
We can see from the correlation matrix that there are a number of positively correlated variables. This could help us later when we are trying to predict Salaries.
We can look at density Curves to see which numeric variables could have effects on a certain categorical outcome. These could be useful for the attrition analysis, where we are trying to classify employees into 2 groups by attrition likelihood.
# Density curve of one numeric explanatory variable, filled by a categorical
# response. `explanatory` and `response` are column names given as strings.
# aes_string() is deprecated in modern ggplot2 -- use the .data pronoun, and
# set labs() so the axis/legend titles keep the plain column names.
densityPlots <- function(df, explanatory, response) {
  df %>%
    ggplot(aes(x = .data[[explanatory]], fill = .data[[response]])) +
    geom_density(alpha = 0.5) +
    labs(x = explanatory, fill = response)
}
# One density plot per numeric column, split by Attrition
densityPlotsList <- lapply(
  cs2m %>% keep(is.numeric) %>% colnames(),
  function(x) densityPlots(cs2m, x, "Attrition")
)
# ggplot objects must be print()ed explicitly inside a loop
for (i in densityPlotsList) {
  print(i)
}
#densityPlots(cs2m, "Age", "Attrition")
For categorical variables we can use a variable review grid that can visualize trends and differences in categorical predictors and the response. In this case, we are looking at the differences in attrition for each variable.
# 1. Name target (response) variable
targetCatCat <- "Attrition"
# 2. Explanatory variables: every factor column in the data
explanatory <- cs2m %>% keep(is.factor) %>% colnames()
# 3. Proportional stacked bar of the response within each level of a
#    categorical predictor. aes_string() is deprecated in modern ggplot2 --
#    use the .data pronoun, with labs() to keep plain column-name titles.
numCatCat <- function(df, explanatory, response) {
  ggplot(data = df) +
    geom_bar(
      aes(x = .data[[explanatory]], fill = .data[[response]]),
      position = "fill", alpha = 0.9
    ) +
    coord_flip() +
    labs(x = explanatory, fill = response)
}
# 4. Create plot list for plot_grid function to reference
plotlistCatCat <- lapply(explanatory, function(x) numCatCat(cs2m, x, targetCatCat))
# Output plots in a loop (print() is required for ggplot objects in loops)
for (i in plotlistCatCat) {
  print(i)
}
The results show that there are some clear separations between levels of most of the categorical variables. It appears that the best course of action initially is to include all of these variables when building any models.
Before we start with either analysis we can split up the dataset into a training and test set for validating any models we run.
# Reproducible 50/50 split for cross-validation: 435 of the 870 rows go to
# the training set, the remainder to the test set.
set.seed(1234)
index <- sample(seq_len(nrow(cs2m)), 435, replace = FALSE)
train <- cs2m[index, ]
test  <- cs2m[-index, ]
library(klaR)
The function below will train a Naive Bayes model using internal cross validation and list the top predictors of all of the available variables.
# Recursive feature elimination (caret::rfe) to rank predictors for a Naive
# Bayes classifier. Column 2 of cs2m is Attrition (the response), so it is
# dropped from the predictor matrix.
x = cs2m[,-2]
y = cs2m$Attrition
# NOTE(review): method="cv" with number=100 requests 100-fold CV, which is
# unusually fine-grained -- confirm this wasn't meant to be 10-fold (or repeats).
control <- rfeControl(functions=nbFuncs, method="cv", number=100) #nbFuncs is for Naive Bayes
results <- rfe(x, y, rfeControl=control)
predictors <- predictors(results) #save the selected predictors so we can re-run the model with them
predictors
## [1] "OverTime" "MonthlyIncome"
## [3] "TotalWorkingYears" "YearsAtCompany"
## [5] "StockOptionLevel" "MaritalStatus"
## [7] "JobLevel" "YearsInCurrentRole"
## [9] "YearsWithCurrManager" "Age"
## [11] "JobInvolvement" "JobSatisfaction"
## [13] "JobRole" "Department"
## [15] "DistanceFromHome" "EnvironmentSatisfaction"
plot(results, type=c("g", "o")) #show a plot of accuracy vs number of predictors (we are mode concerned with Sens and Spec)
The plot shows model accuracy vs number of predictors added and it looks like 15-16 predictors gets good accuracy before we get into overfitting problems. However, we are more concerned with the specificity (or number of “yes” responses for attrition) since there are far fewer of them in the data. We can run models on test and train sets using whatever number of the top predictors we want from the selection done above. Then we can run the model on the full dataset.
# Accuracy, sensitivity and specificity of a Naive Bayes model trained on the
# internal training partition with the top 16 selected predictors, evaluated
# on the held-out test partition.
model <- naiveBayes(train[predictors[1:16]], train$Attrition)
confusionMatrix(table(predict(model, test[predictors[1:16]]), test$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 328 50
## Yes 24 33
##
## Accuracy : 0.8299
## 95% CI : (0.7912, 0.864)
## No Information Rate : 0.8092
## P-Value [Acc > NIR] : 0.149537
##
## Kappa : 0.3742
##
## Mcnemar's Test P-Value : 0.003659
##
## Sensitivity : 0.9318
## Specificity : 0.3976
## Pos Pred Value : 0.8677
## Neg Pred Value : 0.5789
## Prevalence : 0.8092
## Detection Rate : 0.7540
## Detection Prevalence : 0.8690
## Balanced Accuracy : 0.6647
##
## 'Positive' Class : No
##
# Refit the Naive Bayes model on the full dataset with the same 16 predictors
# and report in-sample (training) accuracy, sensitivity and specificity.
fullmodel <- naiveBayes(cs2m[predictors[1:16]], cs2m$Attrition)
confusionMatrix(table(predict(fullmodel, cs2m[predictors[1:16]]), cs2m$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 624 54
## Yes 106 86
##
## Accuracy : 0.8161
## 95% CI : (0.7887, 0.8413)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.969
##
## Kappa : 0.4079
##
## Mcnemar's Test P-Value : 5.533e-05
##
## Sensitivity : 0.8548
## Specificity : 0.6143
## Pos Pred Value : 0.9204
## Neg Pred Value : 0.4479
## Prevalence : 0.8391
## Detection Rate : 0.7172
## Detection Prevalence : 0.7793
## Balanced Accuracy : 0.7345
##
## 'Positive' Class : No
##
The problem with this is that there are so many fewer data points for “yes” (employees that left). This makes it difficult to achieve a high percentage of correct “yes” responses.
We can use a random oversampling technique by way of the ROSE package to synthetically balance the yes and no responses. This could potentially reduce overall model accuracy, but our goal is to improve the low specificity (correct “yes”) of the model.
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.6.3
## Loaded ROSE 0.0-3
table(cs2m$Attrition)
##
## No Yes
## 730 140
# ROSE synthesizes a balanced sample of the two Attrition classes.
# NOTE(review): ROSE() is stochastic -- a set.seed() immediately before this
# call would make the balanced dataset reproducible.
cs2m.balanced <- ROSE(Attrition~., data = cs2m)$data
table(cs2m.balanced$Attrition)
##
## No Yes
## 414 456
Now we have a dataset that is more balanced to run the model. We can split it into new oversampled training and test sets.
#split into training and test sets for cv. Dataset is 870 obs; split in half with 435
# Fix: sample row indices from the *balanced* dataset itself. The original
# sampled 1:dim(cs2m)[1], which only worked because ROSE happens to return
# the same number of rows as the input.
set.seed(1234)
index <- sample(seq_len(nrow(cs2m.balanced)), 435, replace = FALSE)
train.os <- cs2m.balanced[index, ]
test.os  <- cs2m.balanced[-index, ]
Then we can run the NB model again on the balanced sets.
# Re-run RFE predictor selection, now on the balanced training partition.
# Column 2 is Attrition (the response).
x = train.os[,-2]
y = train.os$Attrition
control <- rfeControl(functions=nbFuncs, method="cv", number=50) #nbFuncs is for Naive Bayes
results <- rfe(x, y, rfeControl=control)
predictors <- predictors(results) #save the selected predictors so we can re-run the model with them
predictors
## [1] "OverTime" "MaritalStatus" "JobInvolvement"
## [4] "StockOptionLevel" "TotalWorkingYears" "JobRole"
## [7] "MonthlyIncome" "JobSatisfaction"
The top predictors haven’t changed much, but the model built with the balanced data will likely be very different. We can trust the selection algorithm a bit more now that we are using balanced data. We can test the top predictors to see what the results are for the balanced oversampled training and test partitions.
# The RFE run above already stored the selected predictors; the original code
# repeated the whole rfe() computation here, which silently produced a
# *different* ranking (random CV folds) than the one just printed. Reuse the
# saved ranking instead, and cap the index with head() so we never subscript
# past the end of the vector when fewer than 12 predictors were selected.
topPreds <- head(predictors, 12)
model <- naiveBayes(train.os[topPreds], train.os$Attrition)
confusionMatrix(table(predict(model, test.os[topPreds]), test.os$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 145 55
## Yes 66 169
##
## Accuracy : 0.7218
## 95% CI : (0.6772, 0.7635)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4423
##
## Mcnemar's Test P-Value : 0.3633
##
## Sensitivity : 0.6872
## Specificity : 0.7545
## Pos Pred Value : 0.7250
## Neg Pred Value : 0.7191
## Prevalence : 0.4851
## Detection Rate : 0.3333
## Detection Prevalence : 0.4598
## Balanced Accuracy : 0.7208
##
## 'Positive' Class : No
##
Running the model using the balanced training and test partitions has produced a more balanced sensitivity and specificity (but decreased overall accuracy). We can run the model again on the full balanced dataset and then predict the outcomes for the original dataset to see if we still get more balanced results.
#Accuracy, Sensitivity and Specificity of Model on Training set (Full Dataset)
# head() guards against rfe() having selected fewer than 12 predictors, which
# would otherwise make predictors[1:12] produce NA column selections.
fullmodel <- naiveBayes(cs2m.balanced[head(predictors, 12)], cs2m.balanced$Attrition)
# Predict on the ORIGINAL (unbalanced) data to see if balanced results hold
confusionMatrix(table(predict(fullmodel, cs2m[head(predictors, 12)]), cs2m$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 493 31
## Yes 237 109
##
## Accuracy : 0.692
## 95% CI : (0.6601, 0.7225)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2847
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6753
## Specificity : 0.7786
## Pos Pred Value : 0.9408
## Neg Pred Value : 0.3150
## Prevalence : 0.8391
## Detection Rate : 0.5667
## Detection Prevalence : 0.6023
## Balanced Accuracy : 0.7270
##
## 'Positive' Class : No
##
This model does have much lower accuracy, but the initial model with unbalanced data was producing biased results for overall accuracy because of the lower probability of “yes” outcomes. It could essentially pick “no” most of the time regardless of variable information and get good accuracy.
We can try a logistic regression model using a similar method. We can use the same oversampled training and test sets to get an idea of what the model can do.
# 10-fold cross-validated logistic regression (caret wrapping glm) fit on the
# balanced training partition; then report variable importance.
fitControl <- trainControl(
  method = "cv",          # 10-fold CV
  number = 10,
  savePredictions = TRUE
)
logitmodel <- train(
  Attrition ~ ., data = train.os,
  method = "glm", family = binomial(),
  trControl = fitControl
)
varImp(logitmodel) # variable importance
## glm variable importance
##
## only 20 most important variables shown (out of 44)
##
## Overall
## OverTimeYes 100.00
## JobSatisfaction 65.06
## JobInvolvement 59.45
## RelationshipSatisfaction 48.57
## WorkLifeBalance 45.80
## MaritalStatusSingle 43.68
## YearsSinceLastPromotion 39.58
## GenderMale 36.82
## MaritalStatusMarried 33.75
## DailyRate 29.61
## `JobRoleResearch Director` 29.37
## JobLevel 26.65
## `JobRoleManufacturing Director` 24.81
## NumCompaniesWorked 23.71
## EnvironmentSatisfaction 23.56
## YearsWithCurrManager 22.23
## BusinessTravelTravel_Frequently 21.59
## `JobRoleLaboratory Technician` 21.33
## DistanceFromHome 19.92
## TotalWorkingYears 18.72
confusionMatrix(table(predict(logitmodel,test.os), test.os$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 152 58
## Yes 59 166
##
## Accuracy : 0.731
## 95% CI : (0.6867, 0.7722)
## No Information Rate : 0.5149
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4615
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7204
## Specificity : 0.7411
## Pos Pred Value : 0.7238
## Neg Pred Value : 0.7378
## Prevalence : 0.4851
## Detection Rate : 0.3494
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.7307
##
## 'Positive' Class : No
##
Again, the logistic model found that the top variables affecting attrition were similar to those from the Naive Bayes model. Based on the results, the logistic model may be a bit better, but first we can build a model with the entire balanced/oversampled set, and then predict against the original data.
fitControl <- trainControl(## 50-fold CV -- NOTE(review): the comment previously said 10-fold but number = 50 is used
method = "cv",
number = 50,
savePredictions = TRUE
)
# Fit the logistic model on the entire balanced/oversampled dataset
logitmodel <- train(Attrition ~., data=cs2m.balanced, method="glm", family=binomial(), trControl=fitControl)
summary(logitmodel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0186 -0.7104 0.1763 0.6478 2.4061
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.908e+00 5.468e+02 -0.018 0.985541
## Age -3.818e-03 9.365e-03 -0.408 0.683489
## BusinessTravelTravel_Frequently 9.117e-01 3.999e-01 2.280 0.022636 *
## BusinessTravelTravel_Rarely 3.280e-01 3.412e-01 0.961 0.336428
## DailyRate -2.945e-04 1.829e-04 -1.610 0.107432
## `DepartmentResearch & Development` 1.448e+01 5.468e+02 0.026 0.978868
## DepartmentSales 1.570e+01 5.468e+02 0.029 0.977098
## DistanceFromHome 1.723e-02 9.459e-03 1.821 0.068557 .
## Education -9.536e-02 7.468e-02 -1.277 0.201648
## `EducationFieldLife Sciences` -2.601e+00 9.018e-01 -2.884 0.003925 **
## EducationFieldMarketing -2.389e+00 9.449e-01 -2.528 0.011468 *
## EducationFieldMedical -2.258e+00 8.934e-01 -2.527 0.011497 *
## EducationFieldOther -1.449e+00 9.915e-01 -1.462 0.143819
## `EducationFieldTechnical Degree` -1.766e+00 9.208e-01 -1.918 0.055174 .
## EnvironmentSatisfaction -2.090e-01 6.957e-02 -3.004 0.002665 **
## GenderMale 2.584e-01 1.968e-01 1.313 0.189111
## HourlyRate 2.140e-03 3.710e-03 0.577 0.564056
## JobInvolvement -6.052e-01 1.009e-01 -5.999 1.99e-09 ***
## JobLevel 3.753e-02 1.068e-01 0.351 0.725304
## `JobRoleHuman Resources` 1.438e+01 5.468e+02 0.026 0.979017
## `JobRoleLaboratory Technician` 5.501e-01 4.510e-01 1.220 0.222567
## JobRoleManager -1.031e+00 8.325e-01 -1.239 0.215322
## `JobRoleManufacturing Director` -2.278e+00 6.422e-01 -3.547 0.000390 ***
## `JobRoleResearch Director` -1.921e+00 6.771e-01 -2.837 0.004558 **
## `JobRoleResearch Scientist` -5.744e-02 4.383e-01 -0.131 0.895737
## `JobRoleSales Executive` -1.071e+00 1.060e+00 -1.011 0.312019
## `JobRoleSales Representative` 8.290e-01 1.125e+00 0.737 0.461380
## JobSatisfaction -4.791e-01 7.161e-02 -6.690 2.22e-11 ***
## MaritalStatusMarried 1.602e+00 2.969e-01 5.396 6.82e-08 ***
## MaritalStatusSingle 2.037e+00 3.438e-01 5.924 3.15e-09 ***
## MonthlyIncome 1.197e-05 2.698e-05 0.444 0.657305
## MonthlyRate 8.840e-06 1.107e-05 0.799 0.424400
## NumCompaniesWorked 4.971e-02 2.975e-02 1.671 0.094714 .
## OverTimeYes 1.751e+00 2.052e-01 8.534 < 2e-16 ***
## PercentSalaryHike -2.170e-02 2.245e-02 -0.967 0.333594
## PerformanceRating 3.295e-01 2.314e-01 1.424 0.154530
## RelationshipSatisfaction -2.297e-01 6.820e-02 -3.368 0.000757 ***
## StockOptionLevel -3.631e-02 9.475e-02 -0.383 0.701563
## TotalWorkingYears -1.168e-02 1.423e-02 -0.821 0.411880
## TrainingTimesLastYear -2.415e-01 6.493e-02 -3.719 0.000200 ***
## WorkLifeBalance -3.426e-01 1.020e-01 -3.359 0.000783 ***
## YearsAtCompany 3.249e-03 1.731e-02 0.188 0.851090
## YearsInCurrentRole -4.650e-02 2.765e-02 -1.681 0.092686 .
## YearsSinceLastPromotion 1.398e-01 2.807e-02 4.979 6.40e-07 ***
## YearsWithCurrManager -9.028e-02 2.732e-02 -3.305 0.000950 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1204.05 on 869 degrees of freedom
## Residual deviance: 757.87 on 825 degrees of freedom
## AIC: 847.87
##
## Number of Fisher Scoring iterations: 14
Finally, we can take out insignificant variables and re-run the model.
# Reduced logistic model retaining only the predictors that were significant
# (or near-significant) in the full model above.
logitmodel <- train(
  Attrition ~ DailyRate + DistanceFromHome + Gender + JobInvolvement +
    JobRole + JobSatisfaction + MaritalStatus + NumCompaniesWorked +
    OverTime + TrainingTimesLastYear + YearsSinceLastPromotion +
    YearsWithCurrManager,
  data = cs2m.balanced, method = "glm", family = binomial(),
  trControl = fitControl
)
summary(logitmodel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7868 -0.7754 0.2412 0.7471 2.2935
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2690727 0.5608074 2.263 0.023639 *
## DailyRate -0.0003111 0.0001675 -1.857 0.063302 .
## DistanceFromHome 0.0120021 0.0087050 1.379 0.167965
## GenderMale 0.2563249 0.1797821 1.426 0.153940
## JobInvolvement -0.5994382 0.0940397 -6.374 1.84e-10 ***
## `JobRoleHuman Resources` 0.6930539 0.4965596 1.396 0.162801
## `JobRoleLaboratory Technician` 0.4686219 0.3739550 1.253 0.210151
## JobRoleManager -0.8986730 0.5258181 -1.709 0.087433 .
## `JobRoleManufacturing Director` -2.3114323 0.6083292 -3.800 0.000145 ***
## `JobRoleResearch Director` -1.7521890 0.5815209 -3.013 0.002586 **
## `JobRoleResearch Scientist` 0.0395487 0.3591233 0.110 0.912310
## `JobRoleSales Executive` 0.0637841 0.3466144 0.184 0.853997
## `JobRoleSales Representative` 1.8277922 0.4597748 3.975 7.03e-05 ***
## JobSatisfaction -0.4434666 0.0657139 -6.748 1.49e-11 ***
## MaritalStatusMarried 1.5368398 0.2681608 5.731 9.98e-09 ***
## MaritalStatusSingle 1.9403577 0.2788910 6.957 3.47e-12 ***
## NumCompaniesWorked 0.0339129 0.0274262 1.237 0.216268
## OverTimeYes 1.5631452 0.1848965 8.454 < 2e-16 ***
## TrainingTimesLastYear -0.2290195 0.0604145 -3.791 0.000150 ***
## YearsSinceLastPromotion 0.1126900 0.0240923 4.677 2.90e-06 ***
## YearsWithCurrManager -0.1057620 0.0232437 -4.550 5.36e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1204.05 on 869 degrees of freedom
## Residual deviance: 828.21 on 849 degrees of freedom
## AIC: 870.21
##
## Number of Fisher Scoring iterations: 5
# Score the original (unbalanced) data with the reduced logistic model and
# summarize classification performance.
predictions <- predict(logitmodel, newdata = cs2m)
confusionMatrix(table(predictions, cs2m$Attrition))
## Confusion Matrix and Statistics
##
##
## predictions No Yes
## No 537 31
## Yes 193 109
##
## Accuracy : 0.7425
## 95% CI : (0.7121, 0.7713)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3504
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7356
## Specificity : 0.7786
## Pos Pred Value : 0.9454
## Neg Pred Value : 0.3609
## Prevalence : 0.8391
## Detection Rate : 0.6172
## Detection Prevalence : 0.6529
## Balanced Accuracy : 0.7571
##
## 'Positive' Class : No
##
All parameters are significant and the results are a bit better than the NB model. Since we achieved higher accuracy, sensitivity, and specificity with the logit model, we should use that for our predictions of attrition based on this data.
For Salary, we can look back at the correlation matrix to see what variables are correlated with Salary and/or each other. Intuitively, Total Working Years and Job level appear to be the most correlated with Salary. Others like Years at the company/role/last promotion/with manager are correlated with job level and Working years; it looks like YearsAtCompany might be a good variable to use in their place.
Checking plots of correlated numeric variables. Adding a color for JobLevel helps identify separation by Job Level.
# klaR masks dplyr's select(), so unload it before using select() again
detach("package:klaR", unload = TRUE)
library(dplyr)
# Numeric variables most correlated with MonthlyIncome (from the corrplot)
cs2m1 <- cs2m %>% dplyr::select(MonthlyIncome, JobLevel, TotalWorkingYears, YearsAtCompany, Age)
# Scatterplot matrix, points colored by JobLevel to show separation by level
pairs(cs2m1, col=cs2m$JobLevel)
The most telling of these is TotalWorkingYears and JobLevel - plotting the TotalWorkingYears color coded by Job level looks promising.
plot(x=cs2m$TotalWorkingYears, y=cs2m$MonthlyIncome, col=cs2m$JobLevel, main ="Total Working Years grouped by Job Level vs Salary", xlab="Total Working Years", ylab="Monthly Income ($)")
It doesn’t look like there is a clear trend between YearsAtCompany and MonthlyIncome. Age also looks spread out on the plot, but there is some trend. We can try to take the log of those variables to see if that helps.
# YearsAtCompany has a minimum of 0 (see summary above), so log() would
# produce -Inf values and distort the pairs plot; log1p(x) = log(1 + x)
# handles the zeros gracefully.
cs2m1$log_YearsAtCompany <- log1p(cs2m1$YearsAtCompany)
# Age is at least 18 everywhere, so a plain log is safe here.
cs2m1$log_Age <- log(cs2m1$Age)
pairs(cs2m1)
Taking the log of those variables didn’t make for much better plots, but we can add the original variables to the MLR model and see if they make a significant difference.
Using the variables selected from the initial analysis we can run a simple regression model.
# Initial MLR for MonthlyIncome using the numeric predictors identified from
# the correlation analysis
model1 <- lm(MonthlyIncome~TotalWorkingYears+JobLevel+YearsAtCompany+Age, data=train)
summary(model1)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + YearsAtCompany +
## Age, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3988.9 -858.7 94.7 761.7 3927.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.844e+03 3.214e+02 -5.738 1.81e-08 ***
## TotalWorkingYears 6.782e+01 1.806e+01 3.755 0.000197 ***
## JobLevel 3.721e+03 9.348e+01 39.811 < 2e-16 ***
## YearsAtCompany -7.502e+00 1.454e+01 -0.516 0.606166
## Age 2.202e-02 9.964e+00 0.002 0.998238
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1362 on 430 degrees of freedom
## Multiple R-squared: 0.9216, Adjusted R-squared: 0.9209
## F-statistic: 1264 on 4 and 430 DF, p-value: < 2.2e-16
After looking at the results, the variables “YearsAtCompany” and “Age” are not statistically significant, so they can be removed.
# Reduced model after dropping the non-significant YearsAtCompany and Age
model1 <- lm(MonthlyIncome~TotalWorkingYears+JobLevel, data=train)
summary(model1)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3988.9 -869.8 102.0 767.4 3922.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1851.02 134.32 -13.781 < 2e-16 ***
## TotalWorkingYears 64.96 13.70 4.742 2.88e-06 ***
## JobLevel 3715.64 92.33 40.243 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1359 on 432 degrees of freedom
## Multiple R-squared: 0.9216, Adjusted R-squared: 0.9212
## F-statistic: 2539 on 2 and 432 DF, p-value: < 2.2e-16
#This is the RMSE on the training partition
# Fix: the original used model1$df, which only resolves via $'s partial name
# matching to model1$df.residual -- spell the component out explicitly.
training_RMSE <- sqrt(sum(model1$residuals^2) / model1$df.residual)
print(paste("Training partition RMSE:", training_RMSE))
## [1] "Training partition RMSE: 1359.27044271327"
The adjusted R-sq of .921 is pretty good for just two predictors. We can check variance inflation factors for multicollinearity issues and the residual plots for any assumption violations.
vif(model1)
## TotalWorkingYears JobLevel
## 2.614616 2.614616
#checking residuals -- standard lm diagnostic panel (residuals vs fitted, QQ,
#scale-location, residuals vs leverage) in a 2x2 grid
par(mfrow=c(2,2))
plot(model1)
None of the VIFs appear too high (with only 2 variables, this should be expected). The residuals appear normally distributed and there aren’t any severe outliers. There shouldn’t be any issues, so we can verify the model on the test set.
# Validate the model on the held-out test partition
model1_preds <- predict(model1, test)
test_preds <- data.frame(model1_preds)
#test set RMSE
# Fix: average the squared prediction errors over the number of *test*
# observations. The original divided by the training model's residual df
# (model1$df), which misstates the test RMSE.
test_RMSE <- sqrt(mean((test_preds$model1_preds - test$MonthlyIncome)^2))
print(paste("Test partition RMSE:", test_RMSE))
## [1] "Test partition RMSE: 1429.75879901028"
The RMSE of the test set doesn’t look too far off of the RMSE of the training set, so it appears to be a decent model.
This is a very simple model for interpretive purposes. We can easily determine monthly salary with a relatively high degree of accuracy using only two easily identifiable continuous variables - the total number of years worked and the job level. Since we ended on 2 predictor variables, we can even look at a 3D scatterplot to visualize our model on the data.
# Evaluate the fitted regression plane over a grid of the two predictors so it
# can be overlaid on the 3D scatter of the data.
twy_grid <- seq(1, 40, length = 70)   # TotalWorkingYears axis
jl_grid  <- seq(1, 5, length = 70)    # JobLevel axis
fit_coefs <- summary(model1)$coef
plane <- outer(twy_grid, jl_grid, function(a, b) {
  fit_coefs[1, 1] + fit_coefs[2, 1] * a + fit_coefs[3, 1] * b
})
p <- plot_ly(data = cs2m, z = ~MonthlyIncome, x = ~TotalWorkingYears, y = ~JobLevel, opacity = .6) %>%
  add_markers()
p %>%
  add_surface(x = ~twy_grid, y = ~jl_grid, z = ~plane, showscale = FALSE) %>%
  layout(title = "MLR Model to predict Monthly Income")
The numeric variables have already been hashed out, so we will only include those we know to be helpful. The categorical variables are somewhat difficult to tie to the continuous response of MonthlyIncome. We can add in all of the categorical variables and use ASE plots of the test and training sets to see how many variables we can add before overfitting the model.
We can first pare down our training and test sets to just the variables we will test because we already decided which numeric variables are worth keeping.
# Select only the candidate variables so we don't have to spell them all out
# in the lm() formulas below.
keep_vars <- c(
  "MonthlyIncome", "JobLevel", "TotalWorkingYears", "BusinessTravel",
  "Department", "Education", "EducationField", "Gender", "JobRole",
  "StockOptionLevel"
)
train2 <- train %>% dplyr::select(all_of(keep_vars))
test2 <- test %>% dplyr::select(all_of(keep_vars))
The code below will run a backward selection method that will use both our continuous and categorical variables. It is set to include up to a maximum of 10 variables (all of them). The ASE of the train vs test set are compared to see how many variables can be added without overfitting.
#Backward selection over all 10 candidate variables (the original comment said
#"Forward", but method="backward" is what is used)
reg.bwd=regsubsets(MonthlyIncome~.,data=train2,method="backward", nvmax=10)
#prediction function
# Predict method for a leaps::regsubsets fit: rebuild the design matrix for
# `newdata` from the formula stored in the object's call, then multiply by
# the coefficients of the model with `id` variables.
predict.regsubsets <- function(object, newdata, id, ...) {
  model_formula <- as.formula(object$call[[2]])
  design <- model.matrix(model_formula, newdata)
  beta <- coef(object, id = id)
  design[, names(beta)] %*% beta
}
#Plot test and train ASE; the loop runs to 10, matching nvmax=10 in regsubsets
#(the original comment incorrectly said 15)
testASE <- numeric(10)  # preallocate rather than growing the vector in the loop
for (i in seq_len(10)) {
  predictions <- predict.regsubsets(object = reg.bwd, newdata = test2, id = i)
  testASE[i] <- mean((test2$MonthlyIncome - predictions)^2)
}
par(mfrow = c(1, 1))
plot(1:10, testASE, type = "l", xlab = "# of predictors", ylab = "ASE",
     ylim = c(500000, 2000000), main = "Test (black) vs Train (blue) ASE")
index <- which(testASE == min(testASE))
points(index, testASE[index], col = "red", pch = 10)
rss <- summary(reg.bwd)$rss
# ASE = RSS / sample size; use the training-set row count instead of the
# hard-coded 435 so the code survives a different split size
lines(1:10, rss / nrow(train2), lty = 3, col = "blue")
Based on the ASE comparison plot, the model doesn’t improve after around 4 or 5 selection steps, so we can re-run the model using the first 5 steps and see what they are.
#final regression model
# NOTE(review): the ASE search above used method="backward", but this re-fit
# uses method="forward" -- the printed coefficients below come from the
# forward path. Confirm whether "backward" was intended here for consistency.
reg.final=regsubsets(MonthlyIncome~.,data=train2,method="forward",nvmax=5)
# Coefficients of the 5-variable model on the selection path
coef(reg.final,5)
## (Intercept) JobLevel
## -575.43848 2869.09292
## TotalWorkingYears JobRoleManager
## 53.78693 4154.62672
## JobRoleManufacturing Director JobRoleResearch Director
## 519.84594 4077.35525
The last 3 of the 5 variables were levels of the factor “JobRole”. So really we can just add JobRole to the model and see if we improved it from just including the 2 continuous variables.
# Fit the final model: MonthlyIncome predicted by JobLevel, TotalWorkingYears,
# and the full JobRole factor (all levels), on the training partition.
final.model<-lm(MonthlyIncome~JobLevel+TotalWorkingYears+JobRole,data=train2)
summary(final.model)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears + JobRole,
## data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3156.1 -552.1 -54.3 587.2 4357.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -181.97 275.91 -0.659 0.5099
## JobLevel 2800.43 109.76 25.513 < 2e-16 ***
## TotalWorkingYears 54.09 10.47 5.165 3.7e-07 ***
## JobRoleHuman Resources 132.59 349.99 0.379 0.7050
## JobRoleLaboratory Technician -573.58 229.45 -2.500 0.0128 *
## JobRoleManager 4062.20 315.77 12.864 < 2e-16 ***
## JobRoleManufacturing Director 297.38 228.04 1.304 0.1929
## JobRoleResearch Director 3942.29 289.75 13.606 < 2e-16 ***
## JobRoleResearch Scientist -267.16 226.26 -1.181 0.2384
## JobRoleSales Executive -247.15 196.60 -1.257 0.2094
## JobRoleSales Representative -307.13 279.81 -1.098 0.2730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1004 on 424 degrees of freedom
## Multiple R-squared: 0.958, Adjusted R-squared: 0.957
## F-statistic: 967 on 10 and 424 DF, p-value: < 2.2e-16
# 95% confidence intervals for each coefficient of the final model
confint(final.model)
## 2.5 % 97.5 %
## (Intercept) -724.29598 360.36488
## JobLevel 2584.67459 3016.17626
## TotalWorkingYears 33.50834 74.67774
## JobRoleHuman Resources -555.32979 820.51882
## JobRoleLaboratory Technician -1024.57712 -122.58171
## JobRoleManager 3441.52116 4682.87634
## JobRoleManufacturing Director -150.85553 745.60747
## JobRoleResearch Director 3372.77438 4511.80807
## JobRoleResearch Scientist -711.88196 177.56963
## JobRoleSales Executive -633.57443 139.27035
## JobRoleSales Representative -857.10907 242.85752
#Check Residuals
# 2x2 grid of lm diagnostics: residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage
par(mfrow=c(2,2))
plot(final.model)
The residual plots give no reason for concern. With an adjusted R-sq of .957, the model is looking pretty good.
#test set predictions
# Predict on test2 (same rows as test; test2 carries all model variables) so
# predictions and actuals come from the same object.
fm_preds <- predict(final.model, test2)
fmtest_preds <- data.frame(fm_preds)
#test set RMSE
# Fix: divide the squared-error sum by the number of TEST observations, not
# the training model's residual df (final.model$df = n_train - p), which is
# the wrong denominator for a hold-out RMSE.
fmtest_RMSE <- sqrt(mean((fmtest_preds$fm_preds - test2$MonthlyIncome)^2))
print(paste("Test partition RMSE:", fmtest_RMSE))
## [1] "Test partition RMSE: 1148.28977858179"
The RMSE of the model on the test set looks better than our first model, so we can confidently say that adding the “JobRole” variable improves the predictive capability.
The final linear regression model to be used for predicting Monthly Income is a function of Job Role, Job Level, and Total Working years, which means that we still have a relatively simple and easily interpretable model for prediction.
For the classification (attrition) analysis: The requirement stipulates that the “training set” in addition to the “CaseStudy2CompSet No Attrition.csv” set need to have a sens/spec of > 60/60, the stats for the entire provided dataset are below. Then the predictions on the “CaseStudy2CompSet No Attrition.csv” are made using the logistic regression model and set up for export.
# Evaluate the logistic model (fit earlier, not shown in this chunk) on the
# full provided dataset cs2m.
# NOTE(review): predict() on a plain glm returns link-scale values, yet the
# confusion table below has "No"/"Yes" labels -- presumably logitmodel is a
# caret (or similar) model whose predict() returns class labels; confirm
# upstream where logitmodel is defined.
predictions <- predict(logitmodel, cs2m)
confusionMatrix(table(predictions, cs2m$Attrition))
## Confusion Matrix and Statistics
##
##
## predictions No Yes
## No 537 31
## Yes 193 109
##
## Accuracy : 0.7425
## 95% CI : (0.7121, 0.7713)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3504
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7356
## Specificity : 0.7786
## Pos Pred Value : 0.9454
## Neg Pred Value : 0.3609
## Prevalence : 0.8391
## Detection Rate : 0.6172
## Detection Prevalence : 0.6529
## Balanced Accuracy : 0.7571
##
## 'Positive' Class : No
##
# Score the competition set (no Attrition column) with the logistic model.
cs3 <- read.csv("CaseStudy2CompSet No Attrition.csv",header = TRUE)
Attrition <- predict(logitmodel, cs3)
# Name the first column "ID" explicitly: data.frame(cs3$ID, ...) would have
# written a "cs3.ID" header into the exported CSV.
ltest_preds <- data.frame(ID = cs3$ID, Attrition)
#write.csv(ltest_preds, "Case2PredictionsEysenbach Attrition.csv", row.names = FALSE)
For the regression (Salary) analysis: The requirement stipulates that the “training set” in addition to the “CaseStudy2CompSet No Salary.csv” set need an RMSE of <3000, the RMSE of the entire provided dataset is below. Then the predictions on the “CaseStudy2CompSet No Salary.csv” are made for export.
#RMSE if model is run on entire dataset
# Refit the final model formula on the full provided dataset and report the
# residual standard error: sqrt of RSS over residual degrees of freedom.
fullmodel <- lm(MonthlyIncome ~ JobLevel + TotalWorkingYears + JobRole, data = cs2)
rss_full <- sum(fullmodel$residuals^2)
training_RMSE <- sqrt(rss_full / fullmodel$df.residual)
print(paste("Training RMSE - full dataset:", training_RMSE))
## [1] "Training RMSE - full dataset: 1062.75699001351"
#predictions for "CaseStudy2CompSet No Salary.csv"
cs4 <- read.csv("CaseStudy2CompSet No Salary.csv",header = TRUE)
MonthlySalary <- predict(fullmodel, cs4)
# Name the first column "ID" explicitly: data.frame(cs4$ID, ...) would have
# written a "cs4.ID" header into the exported CSV.
ftest_preds <- data.frame(ID = cs4$ID, MonthlySalary)
#write.csv(ftest_preds, "Case2PredictionsEysenbach Salary.csv", row.names = FALSE)